Group Project

library("ggplot2")
library('dplyr')
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library('tidyverse')
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.4     v purrr   0.3.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library('geosphere')
library("ggmap")
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.

0. Initial Setup

Importing Data

# Reading in the sample CSV of rider data we made
rider_2019_sample <- read.csv('sample.csv', stringsAsFactors = TRUE)
head(rider_2019_sample)
##   tripduration                starttime                 stoptime
## 1          564 2019-04-11 07:44:36.0580 2019-04-11 07:54:00.1840
## 2         1158 2019-05-15 18:00:33.3890 2019-05-15 18:19:52.0150
## 3          763 2019-03-25 13:27:50.4260 2019-03-25 13:40:33.7960
## 4          915 2019-06-21 15:52:07.8340 2019-06-21 16:07:23.6810
## 5         1368 2019-06-01 18:42:22.9500 2019-06-01 19:05:11.7510
## 6          267 2019-07-31 18:47:05.5630 2019-07-31 18:51:33.0870
##   start.station.id       start.station.name start.station.latitude
## 1             3711       E 13 St & Avenue A               40.72967
## 2             3016        Kent Ave & N 7 St               40.72037
## 3              382  University Pl & E 14 St               40.73493
## 4              359       E 47 St & Park Ave               40.75510
## 5             3295 Central Park W & W 96 St               40.79127
## 6             3377     Carroll St & Bond St               40.67861
##   start.station.longitude end.station.id  end.station.name end.station.latitude
## 1               -73.98068            168   W 18 St & 6 Ave             40.73971
## 2               -73.96165           3016 Kent Ave & N 7 St             40.72037
## 3               -73.99201            459  W 20 St & 11 Ave             40.74674
## 4               -73.97499            483   E 12 St & 3 Ave             40.73223
## 5               -73.96484           3142   1 Ave & E 62 St             40.76123
## 6               -73.99037           3398   Smith St & 9 St             40.67470
##   end.station.longitude bikeid   usertype birth.year gender
## 1             -73.99456  29807 Subscriber       1994      1
## 2             -73.96165  34411 Subscriber       1974      1
## 3             -74.00776  16078 Subscriber       1961      1
## 4             -73.98890  29904 Subscriber       1964      2
## 5             -73.96094  30247   Customer       1969      0
## 6             -73.99786  20315 Subscriber       1971      1
# Reading in the weather data set
weather_data <- read.csv('NYCWeather2019.csv', stringsAsFactors = TRUE)
head(weather_data)
##       STATION                        NAME     DATE AWND PRCP SNOW SNWD TAVG
## 1 USW00094728 NY CITY CENTRAL PARK, NY US 1/1/2019   NA 0.06    0    0   NA
## 2 USW00094728 NY CITY CENTRAL PARK, NY US 1/2/2019   NA 0.00    0    0   NA
## 3 USW00094728 NY CITY CENTRAL PARK, NY US 1/3/2019   NA 0.00    0    0   NA
## 4 USW00094728 NY CITY CENTRAL PARK, NY US 1/4/2019   NA 0.00    0    0   NA
## 5 USW00094728 NY CITY CENTRAL PARK, NY US 1/5/2019   NA 0.50    0    0   NA
## 6 USW00094728 NY CITY CENTRAL PARK, NY US 1/6/2019   NA 0.00    0    0   NA
##   TMAX TMIN
## 1   58   39
## 2   40   35
## 3   44   37
## 4   47   35
## 5   47   41
## 6   49   31

Data Summary

# Initial summary of rider data set
str(rider_2019_sample)
## 'data.frame':    100000 obs. of  15 variables:
##  $ tripduration           : int  564 1158 763 915 1368 267 661 1112 520 512 ...
##  $ starttime              : Factor w/ 99999 levels "2019-01-01 00:56:30.7720",..: 18803 28405 14066 41002 34169 54789 95279 5247 68397 75686 ...
##  $ stoptime               : Factor w/ 100000 levels "2019-01-01 01:34:45.0200",..: 18804 28409 14065 41001 34174 54787 95282 5246 68395 75682 ...
##  $ start.station.id       : Factor w/ 825 levels "116","119","120",..: 621 86 688 538 263 348 749 80 259 545 ...
##  $ start.station.name     : Factor w/ 894 levels "1 Ave & E 110 St",..: 352 545 760 386 250 234 797 672 440 99 ...
##  $ start.station.latitude : num  40.7 40.7 40.7 40.8 40.8 ...
##  $ start.station.longitude: num  -74 -74 -74 -74 -74 ...
##  $ end.station.id         : Factor w/ 828 levels "116","119","120",..: 15 86 752 774 184 369 623 27 333 509 ...
##  $ end.station.name       : Factor w/ 890 levels "1 Ave & E 110 St",..: 793 549 795 350 7 714 787 371 598 92 ...
##  $ end.station.latitude   : num  40.7 40.7 40.7 40.7 40.8 ...
##  $ end.station.longitude  : num  -74 -74 -74 -74 -74 ...
##  $ bikeid                 : int  29807 34411 16078 29904 30247 20315 40128 33989 29972 20897 ...
##  $ usertype               : Factor w/ 2 levels "Customer","Subscriber": 2 2 2 2 1 2 1 2 2 2 ...
##  $ birth.year             : int  1994 1974 1961 1964 1969 1971 1969 1960 1972 1966 ...
##  $ gender                 : int  1 1 1 2 0 1 0 1 1 1 ...
summary(rider_2019_sample)
##   tripduration                          starttime    
##  Min.   :     61.0   2019-11-22 17:59:58.4760:    2  
##  1st Qu.:    362.0   2019-01-01 00:56:30.7720:    1  
##  Median :    614.0   2019-01-01 01:35:30.5010:    1  
##  Mean   :    950.8   2019-01-01 02:04:41.7180:    1  
##  3rd Qu.:   1075.0   2019-01-01 02:25:28.9700:    1  
##  Max.   :2769536.0   2019-01-01 02:33:50.6550:    1  
##                      (Other)                 :99993  
##                      stoptime     start.station.id
##  2019-01-01 01:34:45.0200:    1   519    :  810   
##  2019-01-01 01:51:55.8730:    1   3255   :  617   
##  2019-01-01 02:13:13.4810:    1   497    :  602   
##  2019-01-01 02:29:13.1090:    1   402    :  561   
##  2019-01-01 03:04:23.8640:    1   435    :  551   
##  2019-01-01 04:09:48.6020:    1   (Other):96523   
##  (Other)                 :99994   NA's   :  336   
##              start.station.name start.station.latitude start.station.longitude
##  Pershing Square North:  810    Min.   :40.66          Min.   :-74.03         
##  8 Ave & W 31 St      :  617    1st Qu.:40.72          1st Qu.:-74.00         
##  E 17 St & Broadway   :  602    Median :40.74          Median :-73.98         
##  Broadway & E 22 St   :  561    Mean   :40.74          Mean   :-73.98         
##  W 21 St & 6 Ave      :  551    3rd Qu.:40.76          3rd Qu.:-73.97         
##  Broadway & E 14 St   :  548    Max.   :40.85          Max.   :-73.88         
##  (Other)              :96311                                                  
##  end.station.id               end.station.name end.station.latitude
##  519    :  792   Pershing Square North:  792   Min.   :40.66       
##  402    :  636   Broadway & E 22 St   :  636   1st Qu.:40.72       
##  3255   :  632   8 Ave & W 31 St      :  632   Median :40.74       
##  497    :  623   E 17 St & Broadway   :  623   Mean   :40.74       
##  285    :  547   Broadway & E 14 St   :  547   3rd Qu.:40.76       
##  (Other):96426   W 21 St & 6 Ave      :  544   Max.   :40.86       
##  NA's   :  344   (Other)              :96226                       
##  end.station.longitude     bikeid            usertype       birth.year  
##  Min.   :-74.03        Min.   :14529   Customer  :14054   Min.   :1885  
##  1st Qu.:-74.00        1st Qu.:25346   Subscriber:85946   1st Qu.:1970  
##  Median :-73.99        Median :30918                      Median :1983  
##  Mean   :-73.98        Mean   :29674                      Mean   :1980  
##  3rd Qu.:-73.97        3rd Qu.:35049                      3rd Qu.:1990  
##  Max.   :-73.89        Max.   :42046                      Max.   :2003  
##                                                                         
##      gender     
##  Min.   :0.000  
##  1st Qu.:1.000  
##  Median :1.000  
##  Mean   :1.161  
##  3rd Qu.:1.000  
##  Max.   :2.000  
## 
# Initial summart of weather data set
str(weather_data)
## 'data.frame':    365 obs. of  10 variables:
##  $ STATION: Factor w/ 1 level "USW00094728": 1 1 1 1 1 1 1 1 1 1 ...
##  $ NAME   : Factor w/ 1 level "NY CITY CENTRAL PARK, NY US": 1 1 1 1 1 1 1 1 1 1 ...
##  $ DATE   : Factor w/ 365 levels "1/1/2019","1/10/2019",..: 1 12 23 26 27 28 29 30 31 2 ...
##  $ AWND   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PRCP   : num  0.06 0 0 0 0.5 0 0 0.17 0.06 0 ...
##  $ SNOW   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SNWD   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ TAVG   : logi  NA NA NA NA NA NA ...
##  $ TMAX   : int  58 40 44 47 47 49 34 45 45 34 ...
##  $ TMIN   : int  39 35 37 35 41 31 25 34 34 28 ...
summary(rider_2019_sample)
##   tripduration                          starttime    
##  Min.   :     61.0   2019-11-22 17:59:58.4760:    2  
##  1st Qu.:    362.0   2019-01-01 00:56:30.7720:    1  
##  Median :    614.0   2019-01-01 01:35:30.5010:    1  
##  Mean   :    950.8   2019-01-01 02:04:41.7180:    1  
##  3rd Qu.:   1075.0   2019-01-01 02:25:28.9700:    1  
##  Max.   :2769536.0   2019-01-01 02:33:50.6550:    1  
##                      (Other)                 :99993  
##                      stoptime     start.station.id
##  2019-01-01 01:34:45.0200:    1   519    :  810   
##  2019-01-01 01:51:55.8730:    1   3255   :  617   
##  2019-01-01 02:13:13.4810:    1   497    :  602   
##  2019-01-01 02:29:13.1090:    1   402    :  561   
##  2019-01-01 03:04:23.8640:    1   435    :  551   
##  2019-01-01 04:09:48.6020:    1   (Other):96523   
##  (Other)                 :99994   NA's   :  336   
##              start.station.name start.station.latitude start.station.longitude
##  Pershing Square North:  810    Min.   :40.66          Min.   :-74.03         
##  8 Ave & W 31 St      :  617    1st Qu.:40.72          1st Qu.:-74.00         
##  E 17 St & Broadway   :  602    Median :40.74          Median :-73.98         
##  Broadway & E 22 St   :  561    Mean   :40.74          Mean   :-73.98         
##  W 21 St & 6 Ave      :  551    3rd Qu.:40.76          3rd Qu.:-73.97         
##  Broadway & E 14 St   :  548    Max.   :40.85          Max.   :-73.88         
##  (Other)              :96311                                                  
##  end.station.id               end.station.name end.station.latitude
##  519    :  792   Pershing Square North:  792   Min.   :40.66       
##  402    :  636   Broadway & E 22 St   :  636   1st Qu.:40.72       
##  3255   :  632   8 Ave & W 31 St      :  632   Median :40.74       
##  497    :  623   E 17 St & Broadway   :  623   Mean   :40.74       
##  285    :  547   Broadway & E 14 St   :  547   3rd Qu.:40.76       
##  (Other):96426   W 21 St & 6 Ave      :  544   Max.   :40.86       
##  NA's   :  344   (Other)              :96226                       
##  end.station.longitude     bikeid            usertype       birth.year  
##  Min.   :-74.03        Min.   :14529   Customer  :14054   Min.   :1885  
##  1st Qu.:-74.00        1st Qu.:25346   Subscriber:85946   1st Qu.:1970  
##  Median :-73.99        Median :30918                      Median :1983  
##  Mean   :-73.98        Mean   :29674                      Mean   :1980  
##  3rd Qu.:-73.97        3rd Qu.:35049                      3rd Qu.:1990  
##  Max.   :-73.89        Max.   :42046                      Max.   :2003  
##                                                                         
##      gender     
##  Min.   :0.000  
##  1st Qu.:1.000  
##  Median :1.000  
##  Mean   :1.161  
##  3rd Qu.:1.000  
##  Max.   :2.000  
## 

Adjusting Dates and Times in Data Sets

# Creating columns of just month, day, and year
weather_data$DATE <- as.Date(weather_data$DATE, format = "%m/%d/%Y")
weather_data$Month <- format(weather_data$DATE, "%m")
weather_data$Day <- format(weather_data$DATE, "%d")
weather_data$Year <- format(weather_data$DATE, "%Y")
# Creating columns of just month, day, and year
rider_2019_sample$DATE <- as.Date(rider_2019_sample$starttime, format = "%Y-%m-%d")
rider_2019_sample$Month <- format(rider_2019_sample$DATE, "%m")
rider_2019_sample$Day <- format(rider_2019_sample$DATE, "%d")
rider_2019_sample$Year <- format(rider_2019_sample$DATE, "%Y")
dates <- as.POSIXct(rider_2019_sample$stoptime)
rider_2019_sample$endtime <- format(dates, format = "%H:%M")

Rider Age

rider_2019_sample$age <- 2019 - as.numeric(as.character(rider_2019_sample$birth.year))
rider_2019_sample <- filter(rider_2019_sample, age <= 80)

Combining Data Sets

# Combining data frames to compare data
edited_weather <- select(weather_data,
                         PRCP,
                         SNOW,
                         AWND,
                         DATE)
edited_rider <- select(rider_2019_sample, 
                       age,
                       gender,
                       usertype,
                       tripduration,
                       start.station.latitude,
                       start.station.longitude,
                       start.station.id,
                       start.station.name,
                       end.station.latitude,
                       end.station.longitude,
                       end.station.id,
                       end.station.name,
                       DATE,
                       Day,
                       Month,
                       Year,
                       endtime)

total_data = merge(edited_weather, edited_rider, by.x="DATE", by.y="DATE", all.x=TRUE)
head(total_data)
##         DATE PRCP SNOW AWND age gender   usertype tripduration
## 1 2019-01-01 0.06    0   NA  52      1 Subscriber         1166
## 2 2019-01-01 0.06    0   NA  33      1 Subscriber          532
## 3 2019-01-01 0.06    0   NA  55      1 Subscriber          263
## 4 2019-01-01 0.06    0   NA  29      1 Subscriber          196
## 5 2019-01-01 0.06    0   NA  28      1 Subscriber          710
## 6 2019-01-01 0.06    0   NA  37      2 Subscriber          312
##   start.station.latitude start.station.longitude start.station.id
## 1               40.72037               -73.96165             3016
## 2               40.67583               -73.95617             3569
## 3               40.74517               -73.98683              474
## 4               40.72308               -73.98584             3656
## 5               40.75187               -73.97771              519
## 6               40.71422               -73.98135              502
##            start.station.name end.station.latitude end.station.longitude
## 1           Kent Ave & N 7 St             40.72080             -73.95485
## 2 Franklin Ave & St Marks Ave             40.69073             -73.95133
## 3             5 Ave & E 29 St             40.74034             -73.98955
## 4           E 2 St & Avenue A             40.72087             -73.98086
## 5       Pershing Square North             40.73222             -73.98166
## 6         Henry St & Grand St             40.72217             -73.98369
##   end.station.id             end.station.name Day Month Year endtime
## 1           3101        N 12 St & Bedford Ave  01    01 2019   14:56
## 2           3056 Kosciuszko St & Nostrand Ave  01    01 2019   04:09
## 3            402           Broadway & E 22 St  01    01 2019   21:56
## 4            150            E 2 St & Avenue C  01    01 2019   11:44
## 5            504              1 Ave & E 16 St  01    01 2019   18:00
## 6            301            E 2 St & Avenue B  01    01 2019   16:35

Distance Between Stations

# Distance between start and end station in Meters
total_data <- mutate(total_data, 
                            distance = distHaversine(cbind(total_data$start.station.longitude,
                                                           total_data$start.station.latitude),
                                                     cbind(total_data$end.station.longitude,
                                                           total_data$end.station.latitude)))
total_data <- filter(total_data, tripduration <= 3600)
head(total_data)
##         DATE PRCP SNOW AWND age gender   usertype tripduration
## 1 2019-01-01 0.06    0   NA  52      1 Subscriber         1166
## 2 2019-01-01 0.06    0   NA  33      1 Subscriber          532
## 3 2019-01-01 0.06    0   NA  55      1 Subscriber          263
## 4 2019-01-01 0.06    0   NA  29      1 Subscriber          196
## 5 2019-01-01 0.06    0   NA  28      1 Subscriber          710
## 6 2019-01-01 0.06    0   NA  37      2 Subscriber          312
##   start.station.latitude start.station.longitude start.station.id
## 1               40.72037               -73.96165             3016
## 2               40.67583               -73.95617             3569
## 3               40.74517               -73.98683              474
## 4               40.72308               -73.98584             3656
## 5               40.75187               -73.97771              519
## 6               40.71422               -73.98135              502
##            start.station.name end.station.latitude end.station.longitude
## 1           Kent Ave & N 7 St             40.72080             -73.95485
## 2 Franklin Ave & St Marks Ave             40.69073             -73.95133
## 3             5 Ave & E 29 St             40.74034             -73.98955
## 4           E 2 St & Avenue A             40.72087             -73.98086
## 5       Pershing Square North             40.73222             -73.98166
## 6         Henry St & Grand St             40.72217             -73.98369
##   end.station.id             end.station.name Day Month Year endtime  distance
## 1           3101        N 12 St & Bedford Ave  01    01 2019   14:56  576.0106
## 2           3056 Kosciuszko St & Nostrand Ave  01    01 2019   04:09 1707.3540
## 3            402           Broadway & E 22 St  01    01 2019   21:56  584.0158
## 4            150            E 2 St & Avenue C  01    01 2019   11:44  486.4067
## 5            504              1 Ave & E 16 St  01    01 2019   18:00 2213.1388
## 6            301            E 2 St & Avenue B  01    01 2019   16:35  907.8033

1. Initial Data Analysis

Gender Split in Riders

# Reclassifying the genders
# 0=unknown, 1=male, 2=female
total_data$gender <- ifelse(total_data$gender == 0, "Unknown",
                                  ifelse(total_data$gender == 1, "Male", "Female"))

# Seeing the split of genders who rented bikes
total_data %>%
  ggplot(aes(x=gender, fill=gender)) +
  geom_bar() + theme(legend.position="none") +
  ggtitle("Bike Rental Counts by Gender")

From this bar graph, we can see the majority of people who rent bikes are males. Unknown are users who did not input their gender, meaning they are a mix of males, females, and other genders.

Subscriber vs Customer for Riders

# Seeing the split of user type who rented bikes
total_data %>%
  ggplot(aes(x=usertype, fill=usertype)) +
  geom_bar() +
  theme(legend.position="none") + 
  ggtitle("Bike Rental Counts by User Type")

The majority of bike renters are subscribers. This make sense as the subscribers probably use the bikes as everyday transportation and a subscription service is probably cheaper than renting a bike each time. Customers are probably people who just want to enjoy a bike ride once in a while or tourists, where a subscription service wouldn’t make sense.

Trip Duration

# Range of all bike rides
trip_duration_stats <- filter(total_data) %>%
# min range of tripduration  
  summarise(duration_range_min = min(tripduration, na.rm=TRUE), 
# max range of tripduration
            duration_range_max = max(tripduration, na.rm=TRUE), 
# average length of bike ride
            duration_mean = mean(tripduration, na.rm=TRUE), 
# standard deviation of bike ride
            duration_sd = sd(tripduration, na.rm=TRUE))

2. Ride History Pattern Analysis

Speed of Rider Demographics

# Speed of the rider
total_data$speed <- total_data$distance/total_data$tripduration

# Average speed of all riders
all_ride <- total_data %>%
  summarise(average_speed = mean(speed))

# Average speed of young riders
young_ride <- total_data %>%
  filter(age <= 45) %>%
  summarise(young_average = mean(speed))

# Average speed of old riders
old_ride <- total_data %>%
  filter(age >= 65) %>%
  summarise(old_average = mean(speed))

# Average speed of female riders
fem_ride <- total_data %>%
  filter(gender == "Female") %>%
  summarise(female_average = mean(speed))

# Average speed of male riders
male_ride <- total_data %>%
  filter(gender == "Male") %>%
  summarise(male_average = mean(speed))

# Average speed of subscribers
sub_ride <- total_data %>%
  filter(usertype == "Customer") %>%
  summarise(customer_average = mean(speed))

# Average speed of customers
cust_ride <- total_data %>%
  filter(usertype == "Subscriber") %>%
  summarise(subscriber_average = mean(speed))

cbind(all_ride, young_ride, old_ride, fem_ride, male_ride, sub_ride, cust_ride)
##   average_speed young_average old_average female_average male_average
## 1      2.462556      2.538826     2.19201       2.326377     2.572124
##   customer_average subscriber_average
## 1         1.801693           2.565832

Speed of Riders Affected by Weather

# Speed affected by rain
rain_speed <- total_data %>%
  filter(PRCP > 0, SNOW == 0) %>%
  summarise(rain_average = mean(speed))

# Speed affected by snow
snow_speed <- total_data %>%
  filter(SNOW > 0) %>%
  summarise(snow_average = mean(speed))

# Speed affected by wind
wind_speed <- total_data %>%
  filter(PRCP == 0, SNOW == 0, AWND > 0) %>%
  summarise(wind_average = mean(speed))

cbind(rain_speed, snow_speed, wind_speed)
##   rain_average snow_average wind_average
## 1     2.481313     2.554332     2.430265
# Scatter Plot of speed by age
total_data %>%
  ggplot(aes(x = age, y = speed, colour = gender)) +
  geom_point(alpha = .4, size = 1.5) +
  scale_colour_manual(name = 'Gender',
                      values = setNames(c('blue','magenta', 'dark green'),
                                        c('Male', 'Female', 'Unknown'))) +
  geom_smooth(method='lm', colour = 'black') +
  facet_wrap(~gender)  + # make a plot per gender. look up facet_wrap for other fun ways to do this 
  labs(title="Average Speed of Riders by Age", x="Age", y="Speed")
## `geom_smooth()` using formula 'y ~ x'

# Boxplot of speed by gender
total_data %>%
  ggplot(aes(x = gender, y = speed, colour = gender)) +
  geom_boxplot(outlier.colour = 'red') +
  scale_colour_manual(name = 'Gender',
                      values = setNames(c('blue','magenta', 'dark green'),
                                        c('Male', 'Female', 'Unknown'))) +
  labs(title="Speed of Riders by Gender", x="Gender", y="Speed")

From the scatter plots, we can see that all genders saw a decrease in speed as riders got older. The drop off was most steep for the Unkown gender group, but since Unknown is a mix of all genders, it’s hard to make any analyses for the group. Males had a steeper decline in speed as they got older than Females, but from the box plots, we can see that Males were faster than Females on average.

# Boxplot of speed by customer type
total_data %>%
  ggplot(aes(x = usertype, y = speed, colour = usertype)) +
  geom_boxplot(outlier.colour = 'red') +
  scale_colour_manual(name = 'User Type',
                      values = setNames(c('purple', 'orange'),
                                        c('Subscriber', 'Customer'))) +
  labs(title="Speed of Riders by Customer Type", x="Customer Type", y="Speed")

From these box plots, we see that Subscribers ride substantially faster than Customers. This could be because Subscribers use the bike for important purposes like going to work everyday, while Customers are most likely tourists or people riding for leisure who just want to take their time riding through the city.

Start Location Preferences

top_height <- max(total_data$start.station.latitude) - min(total_data$start.station.latitude)
top_width <- max(total_data$start.station.longitude) - min(total_data$start.station.longitude)
top_borders <- c(bottom  = min(total_data$start.station.latitude)  - 0.1 * top_height,
                 top     = max(total_data$start.station.latitude)  + 0.1 * top_height,
                 left    = min(total_data$start.station.longitude) - 0.2 * top_width,
                 right   = max(total_data$start.station.longitude) + 0.2 * top_width)

start <- get_stamenmap(top_borders, zoom = 12, maptype = "toner-lite")
## Source : http://tile.stamen.com/toner-lite/12/1205/1537.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1537.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1537.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1541.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1541.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1541.png
start_map <- ggmap(start, extent = "device", legend = "topright")

start_map + stat_density2d(
aes(x = start.station.longitude, y = start.station.latitude, fill = ..level.., alpha = ..level..,), size = 2, bins = 10, data = total_data, geom = "polygon",  na.rm=TRUE,
) + labs( fill = "Density", title = "Start Location Density") + guides(alpha = F)

This graph shows that most bike trips in 2019 start in the center of NYC, with relatively few in the boroughs by comparison.

Start Locations by Day of Week

# convert dates to weekdays
total_data$day_of_week = weekdays(total_data$DATE)

start_map +
stat_density2d(
aes(x = start.station.longitude, y = start.station.latitude, fill = ..level.., alpha = ..level..),
size =2, bins = 10, geom = "polygon", data = total_data) + 
guides(alpha = F) + labs(fill = "Density", title = "Start Location Density by Day of Week") +
scale_fill_gradient(low = "yellow", high = "red") +
facet_wrap(~ day_of_week) + 
theme(legend.position = "right")

From these charts, we can see that the Saturday and Sunday location densities are slightly more spread than the weekdays, suggesting that the weekend trips are less concentrated in the inner city, albeit still focused in Manhattan.

Start Locations by User Type

start_map +
stat_density2d(
aes(x = start.station.longitude, y = start.station.latitude, fill = ..level.., alpha = ..level..),
size =2, bins = 10, geom = "polygon", data = total_data) + 
guides(alpha = F) + labs(fill = "Density", title = "Start Location Density by User Type") +
scale_fill_gradient(low = "yellow", high = "red") +
facet_wrap(~ usertype) + 
theme(legend.position = "right")

This graph shows that there is larger focus on downtown start locations for Subscribers, whereas Customers are spread out along Manhattan and are present in the boroughs as well.

Start Locations by Trip Duration

# Creating a data frame where trip duration is one SD and above
one_sd_above <- as.numeric(trip_duration_stats[1, 3]) + as.numeric(trip_duration_stats[1, 4])
sd_above_data <- total_data %>%
    filter(trip_duration_stats[1, 3] <= tripduration, tripduration <= one_sd_above)

# Creating a data frame where trip duration is one SD and below
one_sd_below <- as.numeric(trip_duration_stats[1, 3]) - as.numeric(trip_duration_stats[1, 4])
sd_below_data <- total_data %>%
    filter(one_sd_below <= tripduration, tripduration <= trip_duration_stats[1, 3])

# Map with total riders trip duration
ggmap(start) +
  geom_point(data = total_data, mapping = aes(x = start.station.longitude, y = start.station.latitude,
                                        col = tripduration)) +
  scale_color_gradient(low = "yellow", high = "red") +
  labs(title = "Start Location by Trip Duration")

# Map with trip duration one SD above
ggmap(start) +
  geom_point(data = sd_above_data, mapping = aes(x = start.station.longitude, y = start.station.latitude,
                                        col = tripduration)) +
  scale_color_gradient(low = "yellow", high = "red") +
  labs(title = "Start Location by Trip Duration One SD Above Mean")

# Map with trip duration one SD below
ggmap(start) +
  geom_point(data = sd_below_data, mapping = aes(x = start.station.longitude, y = start.station.latitude,
                                        col = tripduration)) +
  scale_color_gradient(low = "yellow", high = "red") +
  labs(title = "Start Location by Trip Duration One SD Below Mean")

These graphs allow us to pinpoint how long riders are riding for depending on their start location. The edges of the city tend to have higher red dots for the trip duration one SD below the mean and the inner part of the city has higher red dots for th trip duration one SD above the mean; this could show that people who are on the outskirts of the city don’t have to travel as far as those in the heart of the city.

End Location Preferences

# Creating a data frame where riders rented bikes in the morning
morning <- total_data %>%
  filter(endtime >= "07:00", endtime <= "12:00")

# Creating a data frame where riders rented bikes in the afternoon
afternoon <- total_data %>%
  filter(endtime > "12:00", endtime <= "16:00")

# Creating a data frame where riders rented bikes in the night
night <- total_data %>%
  filter(endtime > "16:00", endtime <= "20:00")

end <- get_stamenmap(top_borders, zoom = 12, maptype = "toner-lite")
end_map <- ggmap(end, extent = "device", legend = "topright")

#Map of the morning riders
morning_map <- end_map + stat_density2d(
  aes(x = end.station.longitude,
      y = end.station.latitude,
      fill = ..level..,
      alpha = ..level..),
  size = 1,
  bins = 5,
  data = morning,
  geom = "polygon") +
  labs(fill = "Density", title = "Morning Riders Map")

#Map of the afternoon riders
afternoon_map <- end_map + stat_density2d(
  aes(x = end.station.longitude,
      y = end.station.latitude,
      fill = ..level..,
      alpha = ..level..),
  size = 1,
  bins = 5,
  data = afternoon,
  geom = "polygon") +
  labs(fill = "Density", title = "Afternoon Riders Map")

#Map of the night riders
night_map <- end_map + stat_density2d(
  aes(x = end.station.longitude,
      y = end.station.latitude,
      fill = ..level..,
      alpha = ..level..),
  size = 1,
  bins = 5,
  data = night,
  geom = "polygon") +
  labs(fill = "Density", title = "Night Riders Map")

morning_map

afternoon_map

night_map

End Locations by User Type

end_map +
stat_density2d(
aes(x = end.station.longitude, y = end.station.latitude, fill = ..level.., alpha = ..level..),
size =2, bins = 10, geom = "polygon", data = total_data) + 
guides(alpha = F) + labs(fill = "Density", title = "End Location Density by Day of Week") +
scale_fill_gradient(low = "yellow", high = "red") +
facet_wrap(~ day_of_week) + 
theme(legend.position = "right")

We see the same pattern breakdown in the end location density as we do the start locations, suggesting that weekends have a more spread out end locations than the weekdays.

3. Assymetric Traffic Analysis

Symmetric vs. Asymmetric Traffic

total_rides = count(total_data)
test = total_data
test$start.station.name = as.character(test$start.station.name)
test$end.station.name = as.character(test$end.station.name)
test <- test[test$start.station.name==test$end.station.name, ]
same_station = count(test)

symmetric = same_station / total_rides
asymmetric = 1 - symmetric

Only 0.0195904% of rides start and end at the same station, which means that 0.9804096% of rides are asymmetric traffic.

Station Popularity

start_popularity = sort(table(total_data$start.station.name), decreasing=TRUE, na.rm=TRUE)
top10 = round(length(unique(total_data$start.station.name, na.rm=TRUE))*0.1)
top_10 = head(start_popularity, top10)
barplot(top_10)

top_starts = as.data.frame(top_10)
top_10rides = sum(top_starts$Freq)

top10_rides = top_10rides / total_rides

0.3290259 of bike rides start come from the top 10% most used station (which are:{r} top_10). Understanding this, it would be useful to focus on bike availability / station maintenance for the few most frequented stations, as they account for a substantial portion of traffic.

Flow of Traffic per Station

count_starts = as.data.frame(table(total_data$start.station.name), na.rm=TRUE)
names(count_starts) = c("station", "starts")
count_ends = as.data.frame(table(total_data$end.station.name), na.rm=TRUE)
names(count_ends) = c("station", "ends")
station_flow = as.data.frame(merge(count_starts, count_ends, by.x="station", by.y="station", all.x=TRUE, na.rm=TRUE))
station_flow$net = station_flow$starts / station_flow$ends

station_flow = na.omit(station_flow)

station_flow %>% mutate(station = fct_reorder(station, net)) %>% ggplot(aes(x=station, y=net)) + geom_bar(stat = "identity")+ geom_hline(yintercept=1, linetype="dashed", color = "red") + labs(x="Stations", y="Total Starts / Total Ends in 2019", title = "Ratio of Start and End Count by Station") + theme(axis.text.x = element_blank())

The chart above depicts each station’s inflow/outflow of bikes in 2019. Those with a value greater than 1 show that they have a higher rate of bikes starting at their station than ending at their station. These stations would be important to target when thinking about rebalancing bikes, as they overall have more bikes leaving them then coming to them. Similarly, those with the lowest start/end ratios have more bikes ending at their station than leaving, making them prime candidates for moving their surplus to a station in more need.

Bike Surplus and Deficit Stations

surplus_stations = station_flow[station_flow$net < 0.75,]
deficit_stations = station_flow[station_flow$net > 1.25,]

Stations with surplus:
12 Ave & W 125 St, 23 Ave & 27 St, 31 Ave & Crescent St, 31 St & Northern Blvd, 34 Ave & 13 St, 34 St & 35 Ave, 44 Dr & Jackson Ave, 5 St & 51 Ave, 5 St & Market St, Adam Clayton Powell Blvd & W 118 St, Adam Clayton Powell Blvd & W 126 St, Amsterdam Ave & W 125 St, Battery Pl & Greenwich St, Bedford Ave & Montgomery St, Bushwick Ave & Forrest St, Bushwick Ave & Harman St, Carroll St & Bond St, Carroll St & Franklin Ave, Clinton St & Centre St, Columbia St & W 9 St, Commerce St & Van Brunt St, DeKalb Ave & Hudson Ave, DeKalb Ave & Skillman St, Ditmars Blvd & 19 St, Division Ave & Hooper St, Division St & Bowery (old), Dwight St & Van Dyke St, E 103 St & Lexington Ave, E 118 St & 1 Ave, E 118 St & Madison Ave, E 35 St & 3 Ave, E 71 St & 2 Ave, E 98 St & Park Ave, Garfield Pl & 8 Ave, Halsey St & Broadway, Harrison Pl & Porter Ave, Hart St & Wyckoff Ave, Kingsland Ave & Nassau Ave, Lafayette St & Jersey St S, Lenox Ave & W 115 St, Marcus Garvey Blvd & Macon St, Morningside Dr & Amsterdam Ave, Pearl St & Anchorage Pl, Railroad Ave & Kay Ave, Richards St & Delavan St, Stagg St & Morgan Ave, Stewart Ave & Johnson Ave, Suydam St & St. Nicholas Ave, Throop Ave & Myrtle Ave, Union St & Bedford Ave, W 129 St & Convent Ave, W 47 St & 6 Ave, Wilson Ave & Troutman St, Withers St & Kingsland Ave, Wyckoff Av & Jefferson St

Stations with deficit: 1 Ave & E 5 St, 10 Hudson Yards, 11 St & 35 Ave, 2 Ave & 9 St, 21 St & 36 Ave, 21 St & 38 Ave, 24 Ave & 26 St, 27 Ave & 3 St, 27 Ave & 9 St, 28 Ave & 35 St, 28 St & 36 Ave, 3 Ave & E 95 St, 30 Ave & 21 St, 31 Ave & 30 St, 34 Ave & 21 St, 35 Ave & 10 St, 37 St & 24 Ave, 4 Ave & 2 St, 40 Ave & 9 St, 40 Ave & Crescent St, 44 Dr & 21 St, 47 Ave & 31 St, 5 Ave & E 103 St, 6 Ave & Spring St, Adam Clayton Powell Blvd & W 115 St, Atlantic Ave & Furman St, Avenue D & E 8 St, Bedford Ave & S 9 St, Bergen St & Flatbush Ave, Bergen St & Vanderbilt Ave, Broadway & Hancock St, Broadway & Roebling St, Bushwick Ave & McKibbin St, Bushwick Ave & Powers St, Bushwick Ave & Stagg St, Butler St & Court St, Calyer St & Jewel St, Cedar St & Myrtle Ave, Center Blvd & 48 Ave, Central Ave & Flushing Ave, Central Ave & Starr Street, Central Park North & Adam Clayton Powell Blvd, Clermont Ave & Park Ave, Cliff St & Fulton St (Old), Clinton Ave & Myrtle Ave, Court St & Nelson St, Dock 72 Way & Market St, Douglass St & 3 Ave, Duane St & Greenwich St, E 114 St & 1 Ave, E 123 St & Lexington Ave, E 5 St & Avenue C, E 53 St & Lexington Ave, E 58 St & 1 Ave (NW Corner), E 82 St & East End Ave, E 88 St & Park Ave, E 95 St & 3 Ave, E 97 St & Madison Ave, Frost St & Meeker Ave, Fulton St & Irving Pl, Fulton St & Rockwell Pl, Gold St & Frankfort St, Grand Army Plaza & Plaza St West, Halsey St & Tompkins Ave, Jackson Ave & 46 Rd, Jefferson St & Cypress Ave, Knickerbocker Ave & George St, Knickerbocker Ave & Thames St, Lafayette Ave & Classon Ave, Lafayette Ave & St James Pl, Lefferts Pl & Franklin Ave, Madison Ave & E 120 St, Manhattan Av & Leonard St, Marcy Ave & Lafayette Ave, McKibbin St & Manhattan Ave, Melrose St & Broadway, Menahan St & Central Ave, Monroe St & Tompkins Ave, Montgomery St & Franklin Ave, N 11 St & Kent Ave, Newton Rd & 44 St, Park Pl & Vanderbilt Ave, Perry St & Greenwich Ave, Powers St & Olive St, Prospect Park West & 8 St, Putnam Ave & Knickerbocker Ave, Rivington St & Ridge St, Rogers Ave & Sterling St, S 4 St & Wythe Ave, St Nicholas Ave & Manhattan Ave, Stagg St & Union Ave, Suydam St & Knickerbocker Ave, W 106 St & Central Park West, W 113 St & Broadway, W 116 St & Broadway, W 12 St & W 4 St, W 37 St & 10 Ave, W 42 St & 6 Ave, W 88 St & West End Ave, Warren St & Court St, Waterbury St & Stagg St, Willoughby Ave & Hall St, Willoughby Ave & Myrtle Ave, Wolcott St & Dwight St

4. Weather Analysis

Trip Duration with Rain

# Range of all bike rides affected by rain
total_data_rain <- filter(total_data, SNOW == 0, PRCP > 0) %>%
# min range of tripduration 
    summarise(duration_range_min = min(tripduration, na.rm=TRUE), 
# max range of tripduration
            duration_range_max = max(tripduration, na.rm=TRUE), 
# average length of bike ride affected by rain
            duration_mean = mean(tripduration, na.rm=TRUE), 
# standard deviation of bike ride affected by rain
            duration_sd = sd(tripduration, na.rm=TRUE))

Trip Duration with Snow

# Range of all bike rides affected by snow
total_data_snow <- filter(total_data, SNOW > 0) %>%
# min range of tripduration
      summarise(duration_range_min = min(tripduration, na.rm=TRUE), 
# max range of tripduration
      duration_range_max = max(tripduration, na.rm=TRUE), 
# average length of bike ride affected by snow
      duration_mean = mean(tripduration, na.rm=TRUE), 
# standard deviation of bike ride affected by snow
      duration_sd = sd(tripduration, na.rm=TRUE)) 

Trip Duration with Wind

# Range of all bike rides affected by wind
total_data_wind <- filter(total_data, SNOW == 0, PRCP == 0, AWND > 0) %>% 
# min range of tripduration
      summarise(duration_range_min = min(tripduration, na.rm=TRUE), 
# max range of tripduration
      duration_range_max = max(tripduration, na.rm=TRUE), 
# average length of bike ride affected by snow
      duration_mean = mean(tripduration, na.rm=TRUE), 
# standard deviation of bike ride affected by snow
      duration_sd = sd(tripduration, na.rm=TRUE)) 
# Combine above dataframes into one dataframe for side-by-side comparison
dataframe_list = list("Total Data" = trip_duration_stats,
                      "Rain Data" = total_data_rain,
                      "Snow Data" = total_data_snow,
                      "Wind Data" = total_data_wind)
# Can also do rbind(trip_duration_stats, total_data_rain, etc) but this keeps source table names defined above
do.call(rbind, dataframe_list)
##            duration_range_min duration_range_max duration_mean duration_sd
## Total Data                 61               3599      789.3410    587.4150
## Rain Data                  61               3598      777.5114    575.1325
## Snow Data                  62               3548      660.3067    525.4768
## Wind Data                  61               3599      816.5905    601.8395

We set the max time at 3600 seconds, as any time after that seemed to be a bike that hadn’t been properly returned to a station. The average trip time for all riders, regardless of weather effects is 789.3410085. The average time for trip duration when it’s raining is 777.5114014. This is is smaller than the trip duration for all riders regardless of weather, meaning people are probably trying to get out of the rain quickly and not going on leisurely rides. The average time for trip duration when it’s snowing is 660.3067024. This is is also smaller than average trip duration for all riders, regardless of weather. This time is even shorter than people riding in the rain. This most likely indicates that people are riding for pure necessity and not for fun. The average time for trip duration when it’s windy is 816.5905384. This average time is substantially longer than ll other weather types. Most likely doesn’t deter people from riding bikes, but the wind pushing against riders most likely increases their trip duration.

Types of Weather per Month

# Average rain per month
total_data %>%
  filter(SNOW == 0) %>%
  group_by(Month) %>% 
  summarise(average_rain = mean(PRCP, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
##    Month average_rain
##    <chr>        <dbl>
##  1 01          0.0790
##  2 02          0.0668
##  3 03          0.0631
##  4 04          0.122 
##  5 05          0.131 
##  6 06          0.149 
##  7 07          0.158 
##  8 08          0.100 
##  9 09          0.0217
## 10 10          0.123 
## 11 11          0.0386
## 12 12          0.170
# Average snow per month
total_data %>% 
  group_by(Month) %>% 
  summarise(average_snow = mean(SNOW, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
##    Month average_snow
##    <chr>        <dbl>
##  1 01          0.0303
##  2 02          0.0567
##  3 03          0.189 
##  4 04          0     
##  5 05          0     
##  6 06          0     
##  7 07          0     
##  8 08          0     
##  9 09          0     
## 10 10          0     
## 11 11          0     
## 12 12          0.0675
# Average wind speed per month
total_data %>%
  group_by(Month) %>% 
  summarise(average_wind_speed = mean(AWND, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
##    Month average_wind_speed
##    <chr>              <dbl>
##  1 01                NaN   
##  2 02                NaN   
##  3 03                  4.92
##  4 04                  4.35
##  5 05                  3.73
##  6 06                  4.11
##  7 07                  3.41
##  8 08                  3.85
##  9 09                  4.29
## 10 10                  5.25
## 11 11                  5.30
## 12 12                  6.34
# mean returns NaN if all values in group (ex: jan and feb) are NA

Average Rain by Age

# Trip duration by age of riders and rain amount
plot_data <- total_data %>%
  filter(SNOW == 0) %>%
  group_by(age) %>%
  summarise(mean_PRCP_by_age = mean(PRCP),
            mean_duration = mean(tripduration)) 
## `summarise()` ungrouping output (override with `.groups` argument)
plot_data %>%
  ggplot(aes(x = age, y = mean_PRCP_by_age)) +
  geom_point(alpha =0.9, shape = 18, colour = "blue", size = plot_data$mean_duration/150) +
  geom_smooth(colour = "orange") 
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Average Wind by Age

# Mean Wind by Age of Rider
total_data %>% 
  group_by(age) %>%
  summarise(mean_AWND_by_age = mean(AWND,na.rm = TRUE)) %>%
  ggplot(aes(x = age, y = mean_AWND_by_age)) + geom_line() + geom_smooth() 
## `summarise()` ungrouping output (override with `.groups` argument)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Rain Effects on Trip Duration

# Average ride time when it's raining
total_data %>%
  filter(PRCP > 0, SNOW == 0) %>%
  summarise(prcp_duration_mean = mean(tripduration))
##   prcp_duration_mean
## 1           777.5114
total_data %>% 
  filter(PRCP > 0, SNOW == 0) %>%
  ggplot(aes(x = tripduration)) + 
  geom_histogram(colour="black", fill="white") +
  geom_vline(aes(xintercept=mean(tripduration)), color="blue", linetype="dashed", size=1) +
  labs(title="Histogram of Rider Time Affected by Rain", x="Trip Duration", y="Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The distribution is skewed left in this graph. This makes sense as people will not want to be out in the rain long while riding their bike.

Snow Effects on Trip Duration

# Average ride time when it's snowing
total_data %>%
  filter(SNOW > 0) %>%
  summarise(snow_duration_mean = mean(tripduration))
##   snow_duration_mean
## 1           660.3067
total_data %>% 
  filter(SNOW > 0) %>%
  ggplot(aes(x = tripduration)) + 
  geom_histogram(colour="black", fill="white") +
  geom_vline(aes(xintercept=mean(tripduration)), color="blue", linetype="dashed", size=1) +
  labs(title="Histogram of Rider Time Affected by Snow", x="Trip Duration", y="Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The distribution is still skewed left, but we can see that even more bike rides are shorter because fewer people want to be riding a bike in the snow.

Wind Effects on Trip Duration

# Average ride time when it's windy
total_data %>%
  filter(SNOW == 0, PRCP == 0, AWND > 0) %>%
  summarise(wind_duration_mean = mean(tripduration))
##   wind_duration_mean
## 1           816.5905
total_data %>% 
  filter(SNOW == 0, PRCP == 0, AWND > 0) %>%
  ggplot(aes(x = tripduration)) + 
  geom_histogram(colour="black", fill="white") +
  geom_vline(aes(xintercept=mean(tripduration)), color="blue", linetype="dashed", size=1) +
  labs(title="Histogram of Rider Time Affected by Wind", x="Trip Duration", y="Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The distribution is still skewed left, but the time it takes to complete a bike ride has actually gone up. This may be because the wind doesn’t deter them from riding bikes, but it does increase their ride time from the wind slowing them down.

Weather Effects on Number of Rides over Average Ride Time

# Number of rides over average time sin weather effects
ride_num <- total_data %>%
  filter(tripduration > trip_duration_stats[1,3]) %>%
  count()
ride_num[1,1]
## [1] 37200
# Number of rides over average time with rain
rain_num <- total_data %>%
  filter(SNOW == 0, PRCP > 0, tripduration > trip_duration_stats[1,3]) %>%
  count()
rain_num[1,1]
## [1] 12452
# Number of rides over average time with snow
snow_num <- total_data %>%
  filter(SNOW > 0, tripduration > trip_duration_stats[1,3]) %>%
  count()
snow_num[1,1]
## [1] 503
# Number of rides over average time with wind
wind_num <- total_data %>%
  filter(AWND > 0, tripduration > trip_duration_stats[1,3]) %>%
  count()
wind_num[1,1]
## [1] 34200